lm_plotly_model <- function(DF){
  plotly_plot <- DF%>%
  ggplot(aes(x = (avg_sars_cov2_conc), 
             y = (case_rate),
             color = n1_sars_cov2_lod,
             fill = n2_sars_cov2_lod))+
  geom_point()+
  geom_smooth()+
  ggtitle("Strong Linear Relationship Between Log Cases and Log COVID Concentration")+
  xlab("log(Covid-19 concentration)")+
  ylab("log(case rate)")

  return(plotly::ggplotly(plotly_plot))#
}

num_samples <- 16000
for(A in c(1,3,5)){
  X <- 2 * rnorm(num_samples) + 10
  yn_exp <- rnorm(num_samples)
  yn_lin <- exp(2*A)*rnorm(num_samples)
  Y <- exp(X + yn_exp) + yn_lin
  xn_exp <- rnorm(num_samples)
  xn_lin <- exp(1*A)*rnorm(num_samples)
  X <- exp(X + xn_exp)# + xn_lin
  #plot(X, Y)
  plot(log(X), log(Y))
}

library(DSIWastewater)
data(pop_data, package = "DSIWastewater")
data(Covariants_data, package = "DSIWastewater")
#convert covariant info to most prevalent covariant to make more efficient
t_df <- Covariants_data%>%#remove info about total number of genes
  select(-total_sequences)

t_df$main_variant<-colnames(t_df)[apply(t_df, 1, which.max)]#add column saying max value



covar_floor_df <- t_df%>%#add merge variable to merge weekly data to daily+ data
  mutate(dumby_merge_var = floor(as.numeric(lubridate::ymd(week)) / 7),
         main_variant = as.factor(main_variant))%>%
  select(dumby_merge_var, main_variant)


data(pop_data, package = "DSIWastewater")
data(Covariants_data, package = "DSIWastewater")
Graph_DF <- data.select.1%>%
  left_join(rename(pop_data, wwtp_name = site), by = c("wwtp_name"))%>%
  filter(case_rate != 0,
         !is.na(pop))%>%
  mutate(avg_sars_cov2_conc = log(avg_sars_cov2_conc + 1),
         case_rate = log(case_rate + 1),
         LOD = (n1_sars_cov2_lod == "No")&(n2_sars_cov2_lod == "No"))%>%
  mutate(dumby_merge_var = floor((floor(as.numeric(sample_collect_date) / 7) - 1)/2)*2 + 1 )%>%
  left_join(covar_floor_df, by = c("dumby_merge_var"))%>%
  select(-dumby_merge_var)

Graph_DF$pop_group <- as.factor(ntile(Graph_DF$pop, 4))



Graph_DF2 <- Graph_DF%>%
    group_by(wwtp_name)%>%
         mutate(avg_sars_cov2_conc = (avg_sars_cov2_conc -  mean(avg_sars_cov2_conc))/ sd(avg_sars_cov2_conc),
                #case_rate = exp(case_rate),
         case_rate = (case_rate - mean(case_rate))/sd(case_rate))%>%
  ungroup()

df_LOD <- Graph_DF%>%
  filter(n1_sars_cov2_lod == "No",
         n2_sars_cov2_lod == "No")
df_LOD2 <- Graph_DF2%>%
  filter(n1_sars_cov2_lod == "No",
         n2_sars_cov2_lod == "No")
lm_plotly_model(Graph_DF)
lm_plotly_model(Graph_DF2)
#M(t) = unknown function of T. Number of sick people at time t
#C(t) = e_l + a * M(t)*e_m + e_l
#W(t) = e_li + b * M(t)*e_e 



plotly::ggplotly(
  Graph_DF%>%
    ggplot(aes(x = (avg_sars_cov2_conc), 
               y = (case_rate),
               fill = pop_group,
               color = LOD))+
    geom_point()+
    #geom_smooth(method = "lm")+
    ggtitle("Strong Linear Relationship Between Log Cases and Log COVID Concentration")+
    xlab("log(Covid-19 concentration)")+
    ylab("log(case rate)")
)
plotly::ggplotly(
  Graph_DF%>%
    ggplot(aes(x = (avg_sars_cov2_conc), 
               y = (case_rate),
               fill = main_variant,
               color = LOD))+
    geom_point()+
    #geom_smooth(method = "lm")+
    ggtitle("Strong Linear Relationship Between Log Cases and Log COVID Concentration")+
    xlab("log(Covid-19 concentration)")+
    ylab("log(case rate)")
)
gen_R2_params <- function(lm_formula, df){
  lm_run <- lm(lm_formula, data = df)
  return(c(summary(lm_run)$adj.r.squared, length(coef(lm_run))))
}

#main_variant
full_formula <- case_rate ~ avg_sars_cov2_conc:LOD:pop_group:main_variant + LOD:pop_group:main_variant
#
lm_full_inder_formula <- case_rate ~ avg_sars_cov2_conc:LOD:pop_group +  avg_sars_cov2_conc:LOD:main_variant + LOD:pop_group + LOD:main_variant
#
lm_true_formula <- case_rate ~ avg_sars_cov2_conc:pop_group:main_variant + pop_group:main_variant
#
lm_true_inder_formula <- case_rate ~ avg_sars_cov2_conc:pop_group + avg_sars_cov2_conc:main_variant + pop_group + main_variant
##
lm_norm_formula <- case_rate ~ avg_sars_cov2_conc:LOD:main_variant + LOD:main_variant
##
lm_min_formula <- case_rate ~ avg_sars_cov2_conc:main_variant + main_variant
{
  output_DF <- data.frame("Type" = c("ajed R^2", "num_param"),
             "full model" = gen_R2_params(full_formula, Graph_DF),
             "full indirect model" = gen_R2_params(lm_full_inder_formula, Graph_DF),
             "true model" = gen_R2_params(lm_true_formula, df_LOD),
             "true indirect model"= gen_R2_params(lm_true_inder_formula, df_LOD),
             "norm site model" = gen_R2_params(lm_norm_formula, Graph_DF2),
             "true norm site model" = gen_R2_params(lm_min_formula, df_LOD2))
  
  n <- output_DF$Type
  output_DF <- as.data.frame(t(output_DF[,-1]))
  colnames(output_DF) <- n
  output_DF
}
##                       ajed R^2 num_param
## full.model           0.6465076        81
## full.indirect.model  0.6339182        33
## true.model           0.6158864        41
## true.indirect.model  0.6059433        16
## norm.site.model      0.5726986        21
## true.norm.site.model 0.5315432        10

/////////////////////////////////////// // HFG Work ///////////////////////////////////////

data("HFGWaste_data", package = "DSIWastewater")
data(Case_data , package = "DSIWastewater")

#crazy agressive method
hfg_outlier_detection <- function(small_vec){
  sortedVec <- sort(log(small_vec))
  lower_quant <- sortedVec[4]
  upper_quant <- sortedVec[6]
  range <- upper_quant - lower_quant
  retVec = ifelse(log(small_vec) > upper_quant + 1.5 * range,
                  NA, small_vec)
  retVec = ifelse(log(small_vec) < lower_quant - 1.5 * range,
                  NA, small_vec)
  retVec = ifelse(is.infinite(retVec), NA, retVec)
  return(retVec)
}

Pop_DF <- data.frame(
  site = c("Hudson","Kenosha","Platteville","Madison","Merrill","Plymouth","River Falls","Sun Prairie","Wausau","Oshkosh","Wausau"),
  pop = c(19680,122000,14000,380000,10000,9000,16000,34926,42000,67000,42000)
)

hfg_waste_filt_df <- HFGWaste_data%>%
  select(site, date, Filter, Well, N1, N2, PMMOV, HF183, CrP, everything())%>%
  group_by(date, site)%>%
  #mutate(across(N1:CrP, hfg_outlier_detection))%>%
  left_join(Pop_DF)
trend_df <- hfg_waste_filt_df%>%
  group_by(site)%>%
  group_split()%>%
  lapply(loessSmoothMod, InVar = "N1", OutVar = "Trend_N1")%>%
  lapply(loessSmoothMod, InVar = "N2", OutVar = "Trend_N2")%>%
  lapply(loessSmoothMod, InVar = "PMMOV", OutVar = "Trend_PMMOV")%>%
  lapply(loessSmoothMod, InVar = "HF183", OutVar = "Trend_HF183")%>%
  lapply(loessSmoothMod, InVar = "CrP", OutVar = "Trend_CrP")%>%
  bind_rows()%>%
  mutate(
    Diff_N1 = Trend_N1 - N1,
    Diff_N2 = Trend_N2 - N2,
    Diff_PMMOV = Trend_PMMOV - PMMOV,
    Diff_HF183 = Trend_HF183 - HF183,
    Diff_CrP = Trend_CrP - CrP
    )%>%
  select(date, site, Filter, Well, pop, Trend_N1:Diff_CrP, N1LOD, N2LOD)
log_trend_df <- hfg_waste_filt_df%>%
  mutate(log_N1 = log(N1),
         log_N2 = log(N2),
         log_PMMOV = log(PMMOV),
         log_HF183 = log(HF183),
         log_CrP = log(CrP))%>%
  group_by(site)%>%
  group_split()%>%
  lapply(loessSmoothMod, InVar = "log_N1", OutVar = "Trend_N1")%>%
  lapply(loessSmoothMod, InVar = "log_N2", OutVar = "Trend_N2")%>%
  lapply(loessSmoothMod, InVar = "log_PMMOV", OutVar = "Trend_PMMOV")%>%
  lapply(loessSmoothMod, InVar = "log_HF183", OutVar = "Trend_HF183")%>%
  lapply(loessSmoothMod, InVar = "log_CrP", OutVar = "Trend_CrP")%>%
  bind_rows()%>%
  mutate(
    Diff_N1 = Trend_N1 - log_N1,
    Diff_N2 = Trend_N2 - log_N2,
    Diff_PMMOV = Trend_PMMOV - log_PMMOV,
    Diff_HF183 = Trend_HF183 - log_HF183,
    Diff_CrP = Trend_CrP - log_CrP)%>%
  select(site, date, Filter, Well, pop, Trend_N1:Diff_CrP, N1LOD, N2LOD)


diff_norm <- function(df){
  df%>%
    mutate(across(Diff_N1:Diff_CrP, ~.x - mean(.x, na.rm = TRUE)),
                      across(Diff_N1:Diff_CrP, ~ifelse(is.finite(.x),.x,NA)))
}

diff_var <- function(df, name){
  df%>%
    ungroup()%>%
    summarise(across(Diff_N1:Diff_CrP, ~var(.x, na.rm = TRUE)))%>%
    mutate(var_type = name)
}
gen_vars <- function(df){
  trend_variance_df <- df%>%
    group_by(site)%>%
    diff_norm()

  filter_variance_df <- df%>%
    group_by(site, date)%>%
    diff_norm()
  
  well_variance_df <- df%>%
    group_by(site, date, Filter)%>%
    diff_norm()
  
  bind_DF <- rbind(diff_var(trend_variance_df, "trend var"),
                      diff_var(filter_variance_df, "filter var"),
                      diff_var(well_variance_df, "well var"))%>%
    pivot_longer(Diff_N1:Diff_CrP)%>%
    mutate(name = factor(name, levels=c('Diff_N1', 'Diff_N2', 'Diff_PMMOV',
                                        'Diff_HF183', 'Diff_CrP')),
           var_type = factor(var_type, levels = c("trend var", "filter var",
                                                  "well var")))
  levels(bind_DF$name) <- c("N1", "N2", "PMMoV", "HF183", "CrP")
  return(bind_DF)
}

gen_plot_heat <- function(df, title = NA){
  df%>%
    ggplot(aes(x = name, y = var_type)) +
    geom_tile(aes(fill = value)) + 
    geom_text(aes(label = round(value, 3))) +#formatC
    scale_fill_gradient(low = "white", high = "red")+
    scale_x_discrete(position = "top")+
    xlab("signal source")+
    ylab("variance source")+
    ggtitle(title)
}

gen_plot_hist <- function(df, title = NA){
  t_plot <- df%>%
    ggplot(aes(x = name, fill = var_type))+
    geom_col(aes(y = value), position="identity" ) +
    xlab("signal source")+
    ylab("Cumaltive Variance")+
    ggtitle(title)
  return(t_plot)
}
dis_df <- log_trend_df

var_output_base <- gen_vars(log_trend_df)

var_output_lod <- log_trend_df%>%
  filter(N2LOD | N1LOD)%>%
  gen_vars()


var_output_NLOD <- log_trend_df%>%
  filter(!(N2LOD | N1LOD))%>%
  gen_vars()


gen_plot_heat(var_output_base, "source of variance in HFG data")

gen_plot_heat(var_output_lod, "source of variance in above LOD info HFG data")

gen_plot_heat(var_output_NLOD, "source of variance in bellow LOD info HFG data")

gen_plot_hist(var_output_base, "source of variance in HFG data")

gen_plot_hist(var_output_lod, "source of variance in above LOD info HFG data")

gen_plot_hist(var_output_NLOD, "source of variance in bellow LOD info HFG data")

#Trend var: variance points of a day #variance of the signal
#Filter var: variance between collected points #collection variance
#well var: variance of 3 points #PCR test variance / extraction variance

#Gaussian mixture model
#change label to make more clear
#stacked in bar graph
#z = x + y
#var(z) = var(x) + var(y) if independent(
temp_func <- function(DF){
  g_ret <- DF%>%
    ggplot(aes(x = Diff_N1))+
    geom_histogram()
  return(g_ret)
}
temp_func(trend_variance_df)
temp_func(filter_variance_df)
temp_func(well_variance_df)

hist_plot <- ggplot(mapping = aes(x = Diff_N1))+
    geom_histogram(data = well_variance_df, fill = "green")+
    geom_histogram(data = filter_variance_df, fill = "blue")+
    geom_histogram(data = trend_variance_df, fill = "red")

plotly::ggplotly(hist_plot)